{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import faiss\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "# Keep only CSV files and sort them so the load order is deterministic;\n",
    "# os.listdir returns entries in arbitrary order and may include stray files.\n",
    "files = sorted(name for name in os.listdir(\"../data\") if name.lower().endswith(\".csv\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fix_array(x):\n",
    "    x = np.fromstring(\n",
    "    x.replace('\\n','')\n",
    "    .replace('[','')\n",
    "    .replace(']','')\n",
    "    .replace('  ',' '), sep=' ')\n",
    "    return x.reshape((1, 768))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "healthtapQAs.csv\n",
      "questionDoctorQAs.csv\n",
      "webmdQAs.csv\n",
      "ehealthforumQAs.csv\n"
     ]
    }
   ],
   "source": [
    "# Read every CSV once and concatenate in a single call; calling pd.concat\n",
    "# inside the loop re-copies the accumulated frame on every iteration.\n",
    "frames = [pd.read_csv(\"../data/\" + files[0])]\n",
    "for file in files[1:]:\n",
    "    print(file)\n",
    "    frames.append(pd.read_csv(\"../data/\" + file))\n",
    "qa = pd.concat(frames, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the stringified embeddings in both vector columns back into arrays.\n",
    "for column in (\"question_bert\", \"answer_bert\"):\n",
    "    qa[column] = qa[column].apply(fix_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the rows with a fixed seed so the row order (and every index-based\n",
    "# lookup that follows) is reproducible across runs, then renumber from 0.\n",
    "qa = qa.sample(frac=1, random_state=42).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the per-row vectors of each embedding column into one 2-D matrix.\n",
    "question_bert, answer_bert = (\n",
    "    np.concatenate(qa[column].values, axis=0)\n",
    "    for column in (\"question_bert\", \"answer_bert\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(166804, 768)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Both matrices are built from the same rows, so they must line up one-to-one.\n",
    "assert question_bert.shape == answer_bert.shape, \"question/answer matrices differ in shape\"\n",
    "answer_bert.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast both embedding matrices to single precision before they are indexed.\n",
    "question_bert, answer_bert = (\n",
    "    matrix.astype('float32') for matrix in (question_bert, answer_bert)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_flat_l2_index(vectors):\n",
    "    \"\"\"Return a ``faiss.IndexFlatL2`` holding every row of ``vectors``.\n",
    "\n",
    "    The index dimensionality is taken from the last axis of ``vectors``.\n",
    "    \"\"\"\n",
    "    index = faiss.IndexFlatL2(vectors.shape[-1])\n",
    "    index.add(vectors)\n",
    "    return index\n",
    "\n",
    "\n",
    "# One exact L2 index over the answer vectors and one over the question vectors.\n",
    "answer_index = build_flat_l2_index(answer_bert)\n",
    "question_index = build_flat_l2_index(question_bert)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[160654 164703  30136  17242]]\n",
      "[[23.75791  27.490116 27.51597  27.605429]]\n",
      "[[125141      0  55077 142202]]\n",
      "[[ 0.       0.       0.      17.27046]]\n"
     ]
    }
   ],
   "source": [
    "k = 4\n",
    "# Use the first question's embedding as the query against both indexes.\n",
    "query = question_bert[0:1].astype('float32')\n",
    "D1, I1 = answer_index.search(query, k)\n",
    "D2, I2 = question_index.search(query, k)\n",
    "# Neighbour ids first, then distances, for each index in turn.\n",
    "for result in (I1, D1, I2, D2):\n",
    "    print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'i have rather white stools the past few days a bit unwell a bit gassy but no pain. i had gastric sleeve surgery 15 months ago. gall bladder?'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Text of the question in row 0, the row whose embedding was used as the query.\n",
    "qa.loc[0, \"question\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'white stools or stools without their normal color suggests malabsorption of food and can be related to a liver gallbladder or intestinal issues.'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Answer stored alongside the question in row 0.\n",
    "qa.loc[0, \"answer\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[160654, 164703, 30136, 17242]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tolist() yields plain Python ints; list() would keep NumPy scalar objects,\n",
    "# which display as np.int64(...) on recent NumPy releases.\n",
    "I1[0].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "      <th>question_bert</th>\n",
       "      <th>answer_bert</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>160654</th>\n",
       "      <td>i had a nissan fundoplication over 15 years ag...</td>\n",
       "      <td>i had a nissan fundoplication over 15 years ag...</td>\n",
       "      <td>[[0.0368798859, 0.564586043, -0.127868533, 0.3...</td>\n",
       "      <td>[[0.0565036982, 0.185404703, -0.204453021, 0.1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>164703</th>\n",
       "      <td>hypothyroidism- can this make your head feel l...</td>\n",
       "      <td>i've been diagnosed recently with hypothyroidi...</td>\n",
       "      <td>[[-0.415793955, 0.0866782218, 0.0517509356, 0....</td>\n",
       "      <td>[[-0.308385283, 0.173175916, -0.0469668321, 0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30136</th>\n",
       "      <td>is it possible hiatal hernia and intercostal m...</td>\n",
       "      <td>my spouse an asthmatic for 45 years recently h...</td>\n",
       "      <td>[[0.0144768339, 0.320690989, -0.241046146, 0.3...</td>\n",
       "      <td>[[-0.370323479, 0.348526865, -0.448246241, 0.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17242</th>\n",
       "      <td>how long is a cold contagious?</td>\n",
       "      <td>i have had a bad chest cold since friday (toda...</td>\n",
       "      <td>[[0.0144032426, 0.0714931414, -0.162377745, -0...</td>\n",
       "      <td>[[-0.125337616, 0.288207948, -0.22237289, 0.06...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 question  \\\n",
       "160654  i had a nissan fundoplication over 15 years ag...   \n",
       "164703  hypothyroidism- can this make your head feel l...   \n",
       "30136   is it possible hiatal hernia and intercostal m...   \n",
       "17242                      how long is a cold contagious?   \n",
       "\n",
       "                                                   answer  \\\n",
       "160654  i had a nissan fundoplication over 15 years ag...   \n",
       "164703  i've been diagnosed recently with hypothyroidi...   \n",
       "30136   my spouse an asthmatic for 45 years recently h...   \n",
       "17242   i have had a bad chest cold since friday (toda...   \n",
       "\n",
       "                                            question_bert  \\\n",
       "160654  [[0.0368798859, 0.564586043, -0.127868533, 0.3...   \n",
       "164703  [[-0.415793955, 0.0866782218, 0.0517509356, 0....   \n",
       "30136   [[0.0144768339, 0.320690989, -0.241046146, 0.3...   \n",
       "17242   [[0.0144032426, 0.0714931414, -0.162377745, -0...   \n",
       "\n",
       "                                              answer_bert  \n",
       "160654  [[0.0565036982, 0.185404703, -0.204453021, 0.1...  \n",
       "164703  [[-0.308385283, 0.173175916, -0.0469668321, 0....  \n",
       "30136   [[-0.370323479, 0.348526865, -0.448246241, 0.0...  \n",
       "17242   [[-0.125337616, 0.288207948, -0.22237289, 0.06...  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# FAISS ids are row positions in the indexed matrix, so select by position\n",
    "# rather than by label; ids of -1 (no neighbour found) are dropped.\n",
    "answer_hits = I1[0]\n",
    "qa.iloc[answer_hits[answer_hits >= 0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "      <th>question_bert</th>\n",
       "      <th>answer_bert</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>125141</th>\n",
       "      <td>i have rather white stools the past few days a...</td>\n",
       "      <td>indicate lack of pigment (of which bile (made ...</td>\n",
       "      <td>[[0.0354531258, 0.298021883, -0.208888337, 0.2...</td>\n",
       "      <td>[[0.0608750097, 0.1624486, -0.32438454, 0.1565...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>i have rather white stools the past few days a...</td>\n",
       "      <td>white stools or stools without their normal co...</td>\n",
       "      <td>[[0.0354531258, 0.298021883, -0.208888337, 0.2...</td>\n",
       "      <td>[[-0.265324056, 0.277673572, -0.438004732, -0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55077</th>\n",
       "      <td>i have rather white stools the past few days a...</td>\n",
       "      <td>white pale stool may reflect a significant bil...</td>\n",
       "      <td>[[0.0354531258, 0.298021883, -0.208888337, 0.2...</td>\n",
       "      <td>[[-0.228493288, 0.0831475481, -0.440894783, -0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142202</th>\n",
       "      <td>i have had facial flushing for the past 6 mont...</td>\n",
       "      <td>go to see a rheumatologist he or she can sort ...</td>\n",
       "      <td>[[0.0747706592, 0.258431494, -0.0580760166, 0....</td>\n",
       "      <td>[[0.0863120854, 0.22896187, 0.172008887, 0.161...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 question  \\\n",
       "125141  i have rather white stools the past few days a...   \n",
       "0       i have rather white stools the past few days a...   \n",
       "55077   i have rather white stools the past few days a...   \n",
       "142202  i have had facial flushing for the past 6 mont...   \n",
       "\n",
       "                                                   answer  \\\n",
       "125141  indicate lack of pigment (of which bile (made ...   \n",
       "0       white stools or stools without their normal co...   \n",
       "55077   white pale stool may reflect a significant bil...   \n",
       "142202  go to see a rheumatologist he or she can sort ...   \n",
       "\n",
       "                                            question_bert  \\\n",
       "125141  [[0.0354531258, 0.298021883, -0.208888337, 0.2...   \n",
       "0       [[0.0354531258, 0.298021883, -0.208888337, 0.2...   \n",
       "55077   [[0.0354531258, 0.298021883, -0.208888337, 0.2...   \n",
       "142202  [[0.0747706592, 0.258431494, -0.0580760166, 0....   \n",
       "\n",
       "                                              answer_bert  \n",
       "125141  [[0.0608750097, 0.1624486, -0.32438454, 0.1565...  \n",
       "0       [[-0.265324056, 0.277673572, -0.438004732, -0....  \n",
       "55077   [[-0.228493288, 0.0831475481, -0.440894783, -0...  \n",
       "142202  [[0.0863120854, 0.22896187, 0.172008887, 0.161...  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# FAISS ids are row positions in the indexed matrix, so select by position\n",
    "# rather than by label; ids of -1 (no neighbour found) are dropped.\n",
    "question_hits = I2[0]\n",
    "qa.iloc[question_hits[question_hits >= 0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
